# -*- coding: utf-8 -*-
"""
Code for whether section is reintroduced in next congressional term

First created: 4/21/23
Final edit: 8/15/23

"""

import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import time
import os
import csv

# Congress numbers to process; each one gets its own output CSV.
cong = [103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116]

# Column layout of every output row. The order must match the order in which
# pairwise_dice() assembles a row: ten text/metadata fields, nine similarity
# features, the classifier label ('match') and the two list indices (x, y).
header = ['bill1', 'sec1', 'raw_txt1', 'clean_txt1', 'date1',
          'bill2', 'sec2', 'raw_txt2', 'clean_txt2', 'date2',
          'txt1_ad_5', 'txt2_ad_5', 'scope_10', 'num_blocks',
          'perblock_txt1', 'perblock_txt2',
          "first_shared_keys_2", "first_txt1_ad_2", "first_txt2_ad_2",
          'match', 'x', 'y']

# Create (or truncate) one output file per congress containing only the header.
os.chdir('/replication/reintroduced/')
for i in cong:
    num = str(i)
    # The file name must be exactly the one pairwise_dice() appends to
    # ('secs<N>_reintroduced.csv'); without the underscore the header lands in
    # a separate file and the data files end up with no header row.
    with open('secs' + num + '_reintroduced.csv', 'w', encoding='UTF8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)

for i in cong:
   # Files are keyed by congress number; each term is compared with the
   # term that immediately follows it.
   num = str(i)
   num2 = str(i + 1)

   # Per-section similarity scores against enacted text for the current term.
   os.chdir('/replication/enacted_sections/')
   secs_enacted = pd.read_csv('secs'+num+'_enacted.csv')
   # Copy of sec1; not part of the column selection below.
   secs_enacted['section'] = secs_enacted['sec1']
   secs_enactreduce = secs_enacted[['sec1', 'txt1_ad_5', 'txt2_ad_5', 'scope_10', 'num_blocks', 'perblock_txt1', 'perblock_txt2', 'first_shared_keys_2', 'first_txt1_ad_2', 'first_txt2_ad_2']]
   # Collapse to one row per sec1, keeping the column-wise maximum of each score.
   all_enacted = secs_enactreduce.groupby(['sec1'], as_index=False).max()
   all_enacted = all_enacted.rename(columns={'sec1': 'section'})

   # Current-term sections. pairwise_dice() reads rows by position
   # (0, 1, 2, 3 and 5), so the CSV column order must not be changed here.
   os.chdir('/replication/cleaned_sections/'+num+'/')
   secs_all = pd.read_csv(num+"_sections_noboilerplate.csv")
   secs_all = secs_all.drop(['Unnamed: 0'], axis=1)
   # Regex match: keep bills whose id contains "ih" or "is"
   # (presumably introduced-in-House/Senate versions -- confirm).
   secs_all = secs_all[secs_all['bill'].str.contains("ih|is")]
   # Left join keeps every current-term section; sections without a row in
   # all_enacted get NaN scores.
   secs_new = pd.merge(secs_all, all_enacted, on='section', how='left')

   # def_enacted = 1 only when all five scores are >= 0.9 (NaN compares False -> 0).
   secs_new['def_enacted'] = np.where(
      (secs_new['txt1_ad_5']>=0.9)
      & (secs_new['txt2_ad_5']>=0.9)
      & (secs_new['perblock_txt2']>=0.9)
      & (secs_new['first_txt1_ad_2']>=0.9)
      & (secs_new['first_txt2_ad_2']>=0.9),
      1, 0)
   # enacted = 1 when at least one block was recorded (NaN -> 0).
   secs_new['enacted'] = np.where((secs_new['num_blocks']>=1), 1, 0)

   #Bring in next term sections
   os.chdir('/replication/cleaned_sections/'+num2+'/')
   secs_nextterm = pd.read_csv(num2+"_sections_noboilerplate.csv")
   secs_nextterm = secs_nextterm.drop(['Unnamed: 0'], axis=1)
   secs_nextterm = secs_nextterm[secs_nextterm['bill'].str.contains("ih|is|enr|rds|eh|es|pcs|cps|cph|ath|ats|eas|eah")]
   secs_nextterm['date'] = pd.to_datetime(secs_nextterm['date'])
   # sort_values returns a new frame; the result has to be assigned back or
   # the sort is silently discarded. A stable sort keeps the file order for
   # sections that share a date, so the y indices written out are reproducible.
   secs_nextterm = secs_nextterm.sort_values(['date'], kind='mergesort')

   #Code to determine if section not enacted last term is enacted in the next term
   # Plain lists of rows (column order preserved) for the pairwise comparison.
   secs_current_list= secs_new.values.tolist()
   secs_nextterm_list = secs_nextterm.values.tolist()
   counter1 = len(secs_current_list)
   counter2 = len(secs_nextterm_list)

   # Output rows are appended relative to this directory.
   os.chdir('/replication/reintroduced/')
   #Match previously unenacted sections with new sections
   def pairwise_dice(num1, txts1, num2, txts2):
       """Compare every row of ``txts1`` with every row of ``txts2`` and log matches.

       For each pair whose cleaned texts (row position 3) have a ``dice`` score
       above 0.5, the pair is scored with ``gen_hash`` (third argument 5 and
       10), ``blocks`` and ``gen_hash_first100``; nine values taken from those
       results are passed to ``clf_model_minor.predict``. When the prediction
       equals 1, one row is appended to ``secs<num>_reintroduced.csv`` in the
       current working directory.

       Uses names defined outside this function: ``num``, ``dice``,
       ``gen_hash``, ``blocks``, ``gen_hash_first100`` and ``clf_model_minor``.
       NOTE(review): what those helpers compute is not visible from here.

       Args:
           num1: number of leading rows of ``txts1`` to scan.
           txts1: list of rows; positions 0, 1, 2, 3 and 5 are written out as
               bill, section, raw text, cleaned text and date.
           num2: number of leading rows of ``txts2`` to scan.
           txts2: list of rows with the same positional layout as ``txts1``.

       Returns:
           None. Results are only written to the CSV file and printed.
       """
       def _blank_if_long(text):
           # Texts of 30000 characters or more are stored as '' in the row.
           return text if len(text) < 30000 else ''

       for x in range(num1):
           left = txts1[x]
           # A float in the cleaned-text slot is skipped
           # (presumably NaN for missing text -- confirm).
           if isinstance(left[3], float):
               continue
           print(num, x)
           for y in range(num2):
               right = txts2[y]
               if isinstance(right[3], float):
                   continue
               # Cheap pre-filter: only pairs scoring above 0.5 get full scoring.
               if dice(left[3], right[3]) <= 0.5:
                   continue

               score5 = gen_hash(left[3], right[3], 5, left[0], right[0])
               score10 = gen_hash(left[3], right[3], 10, left[0], right[0])
               block1 = blocks(left[3], right[3])
               score_first2 = gen_hash_first100(left[3], right[3], 5, left[0], right[0])

               # Positions 0-9: metadata and text for both sections;
               # positions 10-18: the nine classifier features.
               record = [
                   left[0],
                   left[1],
                   _blank_if_long(left[2]),
                   _blank_if_long(left[3]),
                   left[5],
                   right[0],
                   right[1],
                   _blank_if_long(right[2]),
                   _blank_if_long(right[3]),
                   right[5],
                   score5[3],
                   score5[4],
                   score10[5],
                   block1[1],
                   block1[4],
                   block1[5],
                   score_first2[2],
                   score_first2[3],
                   score_first2[4],
               ]

               # predict() receives a single-row 2-D array of the nine features.
               x_full = np.asarray([record[10:19]])
               label = clf_model_minor.predict(x_full)[0]
               record.append(label)

               if label == 1:
                   # Record the list indices of the matched pair as well.
                   record.extend((x, y))
                   print(num,x,y)
                   # Re-opened per row, so each match is on disk as soon as it is found.
                   with open('secs'+num+'_reintroduced.csv', 'a', encoding='UTF8', newline='') as f:
                       csv.writer(f).writerow(record)
                     
   # Run the pairwise comparison for this congress and report the CPU time
   # it consumed (process_time excludes time spent sleeping).
   start = time.process_time()
   comparisons = pairwise_dice(
      num1=counter1,
      txts1=secs_current_list,
      num2=counter2,
      txts2=secs_nextterm_list,
   )
   elapsed = time.process_time() - start
   print(elapsed)